from sklearn import preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# Load the house-sales CSV into a DataFrame and preview the first rows
csv_path = "C:/Users/abeba/IAF 603/IAF 603/kc_house_data.csv"
mydata = pd.read_csv(csv_path, sep=',')
mydata.head()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
mydata.tail()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21608 | 263000018 | 20140521T000000 | 360000.0 | 3 | 2.50 | 1530 | 1131 | 3.0 | 0 | 0 | ... | 8 | 1530 | 0 | 2009 | 0 | 98103 | 47.6993 | -122.346 | 1530 | 1509 |
| 21609 | 6600060120 | 20150223T000000 | 400000.0 | 4 | 2.50 | 2310 | 5813 | 2.0 | 0 | 0 | ... | 8 | 2310 | 0 | 2014 | 0 | 98146 | 47.5107 | -122.362 | 1830 | 7200 |
| 21610 | 1523300141 | 20140623T000000 | 402101.0 | 2 | 0.75 | 1020 | 1350 | 2.0 | 0 | 0 | ... | 7 | 1020 | 0 | 2009 | 0 | 98144 | 47.5944 | -122.299 | 1020 | 2007 |
| 21611 | 291310100 | 20150116T000000 | 400000.0 | 3 | 2.50 | 1600 | 2388 | 2.0 | 0 | 0 | ... | 8 | 1600 | 0 | 2004 | 0 | 98027 | 47.5345 | -122.069 | 1410 | 1287 |
| 21612 | 1523300157 | 20141015T000000 | 325000.0 | 2 | 0.75 | 1020 | 1076 | 2.0 | 0 | 0 | ... | 7 | 1020 | 0 | 2008 | 0 | 98144 | 47.5941 | -122.299 | 1020 | 1357 |
5 rows × 21 columns
mydata.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 21613.0 | 4.580302e+09 | 2.876566e+09 | 1.000102e+06 | 2.123049e+09 | 3.904930e+09 | 7.308900e+09 | 9.900000e+09 |
| price | 21613.0 | 5.400881e+05 | 3.671272e+05 | 7.500000e+04 | 3.219500e+05 | 4.500000e+05 | 6.450000e+05 | 7.700000e+06 |
| bedrooms | 21613.0 | 3.370842e+00 | 9.300618e-01 | 0.000000e+00 | 3.000000e+00 | 3.000000e+00 | 4.000000e+00 | 3.300000e+01 |
| bathrooms | 21613.0 | 2.114757e+00 | 7.701632e-01 | 0.000000e+00 | 1.750000e+00 | 2.250000e+00 | 2.500000e+00 | 8.000000e+00 |
| sqft_living | 21613.0 | 2.079900e+03 | 9.184409e+02 | 2.900000e+02 | 1.427000e+03 | 1.910000e+03 | 2.550000e+03 | 1.354000e+04 |
| sqft_lot | 21613.0 | 1.510697e+04 | 4.142051e+04 | 5.200000e+02 | 5.040000e+03 | 7.618000e+03 | 1.068800e+04 | 1.651359e+06 |
| floors | 21613.0 | 1.494309e+00 | 5.399889e-01 | 1.000000e+00 | 1.000000e+00 | 1.500000e+00 | 2.000000e+00 | 3.500000e+00 |
| waterfront | 21613.0 | 7.541757e-03 | 8.651720e-02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
| view | 21613.0 | 2.343034e-01 | 7.663176e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.000000e+00 |
| condition | 21613.0 | 3.409430e+00 | 6.507430e-01 | 1.000000e+00 | 3.000000e+00 | 3.000000e+00 | 4.000000e+00 | 5.000000e+00 |
| grade | 21613.0 | 7.656873e+00 | 1.175459e+00 | 1.000000e+00 | 7.000000e+00 | 7.000000e+00 | 8.000000e+00 | 1.300000e+01 |
| sqft_above | 21613.0 | 1.788391e+03 | 8.280910e+02 | 2.900000e+02 | 1.190000e+03 | 1.560000e+03 | 2.210000e+03 | 9.410000e+03 |
| sqft_basement | 21613.0 | 2.915090e+02 | 4.425750e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.600000e+02 | 4.820000e+03 |
| yr_built | 21613.0 | 1.971005e+03 | 2.937341e+01 | 1.900000e+03 | 1.951000e+03 | 1.975000e+03 | 1.997000e+03 | 2.015000e+03 |
| yr_renovated | 21613.0 | 8.440226e+01 | 4.016792e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.015000e+03 |
| zipcode | 21613.0 | 9.807794e+04 | 5.350503e+01 | 9.800100e+04 | 9.803300e+04 | 9.806500e+04 | 9.811800e+04 | 9.819900e+04 |
| lat | 21613.0 | 4.756005e+01 | 1.385637e-01 | 4.715590e+01 | 4.747100e+01 | 4.757180e+01 | 4.767800e+01 | 4.777760e+01 |
| long | 21613.0 | -1.222139e+02 | 1.408283e-01 | -1.225190e+02 | -1.223280e+02 | -1.222300e+02 | -1.221250e+02 | -1.213150e+02 |
| sqft_living15 | 21613.0 | 1.986552e+03 | 6.853913e+02 | 3.990000e+02 | 1.490000e+03 | 1.840000e+03 | 2.360000e+03 | 6.210000e+03 |
| sqft_lot15 | 21613.0 | 1.276846e+04 | 2.730418e+04 | 6.510000e+02 | 5.100000e+03 | 7.620000e+03 | 1.008300e+04 | 8.712000e+05 |
# Remove the columns that are not used in the rest of the analysis,
# then count the missing values left in every remaining column.
unused_columns = ['id', 'date', 'sqft_living15', 'sqft_lot15']
mydata = mydata.drop(columns=unused_columns)
mydata.isna().sum()
price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 dtype: int64
mydata[pd.isna(mydata['waterfront'])][['waterfront', 'lat', 'long']]
| waterfront | lat | long |
|---|
# Missing-value summary: one row per column, sorted by the number of NAs.
NA_Count = pd.DataFrame({'Sum of NA': mydata.isnull().sum()}).sort_values(by=['Sum of NA'], ascending=[0])
# Share of missing entries per column: divide by the number of ROWS (shape[0]).
# shape[1] is the number of columns and does not yield a missing-value fraction.
NA_Count['Percentage'] = NA_Count['Sum of NA'] / mydata.shape[0]
sum(NA_Count['Percentage'])
0.0
mydata[mydata['waterfront']==1.0][['waterfront', 'lat', 'long']]
| waterfront | lat | long | |
|---|---|---|---|
| 49 | 1 | 47.4041 | -122.451 |
| 230 | 1 | 47.4497 | -122.375 |
| 246 | 1 | 47.6338 | -122.072 |
| 264 | 1 | 47.4683 | -122.438 |
| 300 | 1 | 47.6053 | -122.077 |
| ... | ... | ... | ... |
| 19984 | 1 | 47.6318 | -122.071 |
| 20325 | 1 | 47.5283 | -122.205 |
| 20767 | 1 | 47.5724 | -122.104 |
| 21201 | 1 | 47.5285 | -122.205 |
| 21576 | 1 | 47.5943 | -122.110 |
163 rows × 3 columns
# Not needed: the column has no missing (NA) values
#mydata['waterfront'] = mydata['waterfront'].fillna(0.0)
#mydata.info()
mydata.view.value_counts()
0 19489 2 963 3 510 1 332 4 319 Name: view, dtype: int64
# Not needed: 'view' has no missing values to drop
#mydata = mydata.dropna(subset=['view'])
mydata.yr_renovated.value_counts()
0 20699
2014 91
2013 37
2003 36
2007 35
...
1948 1
1954 1
1951 1
1944 1
1959 1
Name: yr_renovated, Length: 70, dtype: int64
# Build years of the houses whose renovation year is recorded as 0
not_renovated_mask = mydata['yr_renovated'] == 0.0
never_renovated = mydata.loc[not_renovated_mask, 'yr_built']
never_renovated.describe()
count 20699.000000 mean 1972.395092 std 28.855465 min 1900.000000 25% 1953.000000 50% 1977.000000 75% 1998.000000 max 2015.000000 Name: yr_built, dtype: float64
# Cast the waterfront flag to boolean
mydata['waterfront'] = mydata['waterfront'].astype('bool')
# 0/1 indicators, computed with vectorised comparisons while the source
# columns are still numeric (the categorical cast of yr_renovated follows).
mydata['basement_present'] = (mydata['sqft_basement'] > 0).astype('int64')  # Indicate whether there is a basement or not
mydata['renovated'] = (mydata['yr_renovated'] > 0).astype('int64')  # 1 if the house has been renovated
mydata['yr_renovated'] = mydata['yr_renovated'].astype('category')
# The raw basement / above-ground areas are no longer needed
mydata = mydata.drop(columns=['sqft_basement', 'sqft_above'])
plt.figure(figsize=[10, 10])
sns.countplot(x='bathrooms', data=mydata)  # outliers present
<AxesSubplot:xlabel='bathrooms', ylabel='count'>
# Drop rows with zero bedrooms, zero bathrooms, or the 33-bedroom outlier
invalid_rows = (
    (mydata.bedrooms == 0)
    | (mydata.bedrooms == 33)
    | (mydata.bathrooms == 0)
)
mydata = mydata.drop(index=mydata.index[invalid_rows])
#Distribution of house prices
fig = plt.figure(figsize=[5, 5])
# sns.distplot is deprecated (it emits a FutureWarning and is scheduled for
# removal); histplot with a KDE overlay on a density scale is its axes-level
# replacement.
sns.histplot(mydata['price'], kde=True, stat='density')  # outliers present
print('Distribution of Prices')
C:\Users\abeba\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Distribution of Prices
# Histogram of sale prices with plain (non-scientific) tick labels
fig = plt.figure(figsize=[15, 8])
price_ax = sns.histplot(data=mydata, x='price', bins=60)
price_ax.ticklabel_format(style='plain')
price_ax.set_xlabel("Price ($)")
Text(0.5, 0, 'Price ($)')
# Correlation matrix
plt.figure(figsize=(25, 20))
corr_matrix = mydata.corr()
sns.heatmap(corr_matrix, annot=True, cmap="RdBu")
plt.show()
import statsmodels.api as sm
# Q-Q plots of the raw price and of its natural logarithm
log_price = np.log(mydata['price'])
fig = sm.qqplot(mydata['price'])
plt.show()
fig = sm.qqplot(log_price)
plt.show()
# Keep the log-transformed price as an extra column
mydata['logprice'] = log_price
mydata.head()
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | yr_built | yr_renovated | zipcode | lat | long | basement_present | renovated | logprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | False | 0 | 3 | 7 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 0 | 0 | 12.309982 |
| 1 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | False | 0 | 3 | 7 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1 | 1 | 13.195614 |
| 2 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | False | 0 | 3 | 6 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 0 | 0 | 12.100712 |
| 3 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | False | 0 | 5 | 7 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1 | 0 | 13.311329 |
| 4 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | False | 0 | 3 | 8 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 0 | 0 | 13.142166 |
# After the log transformation the price distribution looks approximately normal
fig = plt.figure(figsize=[15, 8])
logprice_ax = sns.histplot(data=mydata, x='logprice', bins=60)
logprice_ax.ticklabel_format(style='plain')
logprice_ax.set_xlabel("logPrice ($)")
Text(0.5, 0, 'logPrice ($)')
# Column means for waterfront vs. non-waterfront houses.
# numeric_only=True makes the exclusion of the categorical yr_renovated column
# explicit; without it newer pandas versions raise instead of dropping it.
mydata.groupby("waterfront").mean(numeric_only=True).T
| waterfront | False | True |
|---|---|---|
| price | 531667.833061 | 1.661876e+06 |
| bedrooms | 3.372370 | 3.300613e+00 |
| bathrooms | 2.111569 | 2.677914e+00 |
| sqft_living | 2072.028181 | 3.173687e+03 |
| sqft_lot | 15021.710586 | 2.537183e+04 |
| floors | 1.493001 | 1.641104e+00 |
| view | 0.207437 | 3.766871e+00 |
| condition | 3.408809 | 3.533742e+00 |
| grade | 7.649466 | 8.773006e+00 |
| yr_built | 1971.067793 | 1.962190e+03 |
| zipcode | 98077.809406 | 9.809653e+04 |
| lat | 47.560260 | 4.753736e+01 |
| long | -122.213463 | -1.222816e+02 |
| basement_present | 0.391359 | 6.012270e-01 |
| renovated | 0.040685 | 2.576687e-01 |
| logprice | 13.040108 | 1.410262e+01 |
# Horizontal bar chart of every column's correlation with price (price itself removed)
plt.figure(figsize=(8, 10))
price_corr = mydata.corr()["price"].drop("price").sort_values()
price_corr.plot(kind="barh")
<AxesSubplot:>
# Notebook shell command (not a Python statement): installs the plotly_express package, pinned to 0.4.0
pip install plotly_express==0.4.0
Requirement already satisfied: plotly_express==0.4.0 in c:\users\abeba\anaconda3\lib\site-packages (0.4.0) Requirement already satisfied: statsmodels>=0.9.0 in c:\users\abeba\anaconda3\lib\site-packages (from plotly_express==0.4.0) (0.12.2) Requirement already satisfied: pandas>=0.20.0 in c:\users\abeba\anaconda3\lib\site-packages (from plotly_express==0.4.0) (1.2.4) Requirement already satisfied: scipy>=0.18 in c:\users\abeba\anaconda3\lib\site-packages (from plotly_express==0.4.0) (1.6.2) Requirement already satisfied: plotly>=4.0.0 in c:\users\abeba\anaconda3\lib\site-packages (from plotly_express==0.4.0) (5.4.0) Requirement already satisfied: numpy>=1.11 in c:\users\abeba\anaconda3\lib\site-packages (from plotly_express==0.4.0) (1.20.1) Requirement already satisfied: patsy>=0.5 in c:\users\abeba\anaconda3\lib\site-packages (from plotly_express==0.4.0) (0.5.1) Requirement already satisfied: pytz>=2017.3 in c:\users\abeba\anaconda3\lib\site-packages (from pandas>=0.20.0->plotly_express==0.4.0) (2021.1) Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\abeba\anaconda3\lib\site-packages (from pandas>=0.20.0->plotly_express==0.4.0) (2.8.1) Requirement already satisfied: six in c:\users\abeba\anaconda3\lib\site-packages (from patsy>=0.5->plotly_express==0.4.0) (1.15.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\abeba\anaconda3\lib\site-packages (from plotly>=4.0.0->plotly_express==0.4.0) (8.0.1) Note: you may need to restart the kernel to use updated packages.
import matplotlib.pyplot as plt
import plotly.express as px
# Interactive map of the sales, coloured by log price
hover_columns = ["sqft_lot", "sqft_living", "price", "condition"]
display_labels = {"sqft_lot":"Lot Size","sqft_living": "Home size", "price": "Sale Price"}
smap = px.scatter_mapbox(
    mydata,
    lat="lat",
    lon="long",
    zoom=8.5,
    color="logprice",
    hover_name="zipcode",
    hover_data=hover_columns,
    labels=display_labels,
    color_continuous_scale="viridis_r",
    opacity=.8,
)
# Map style, height and zero margins applied in a single layout update
smap.update_layout(
    mapbox_style="carto-positron",
    height=500,
    margin={"r":0,"t":0,"l":0,"b":0},
)
smap.show()
# Scatter of each numerical feature against price.
# relplot is figure-level and always opens its own figure, so no plt.figure
# call is made here (it would only leave an empty figure behind).
for numerical_feature in ['sqft_living','yr_built', 'grade','sqft_lot']:
    sns.relplot(kind='scatter', x='price', y=numerical_feature, data=mydata, aspect=3)
<Figure size 864x432 with 0 Axes>
# The same thing as above, just on a log scale for price.
# relplot is figure-level, so no separate plt.figure is opened (it stayed empty).
for numerical_feature in ['sqft_living','yr_built', 'grade','sqft_lot']:
    log_grid = sns.relplot(kind='scatter', x='logprice', y=numerical_feature, data=mydata, aspect=3)
    # Label each grid inside the loop; a single plt.xlabel after the loop
    # only reaches the last figure.
    log_grid.set_xlabels("Log Price")
<Figure size 864x432 with 0 Axes>
# Price against bedrooms, bathrooms and their sum on one set of axes.
# Each artist carries its own label so the legend cannot be mismatched; a
# positional label list is paired with artists in matplotlib's handle order,
# which is not the order in which the plots were drawn.
plt.figure(121, figsize=(7, 7))
sns.scatterplot(x=mydata["bedrooms"], y=mydata["price"], label="bedrooms")
sns.scatterplot(x=mydata["bathrooms"], y=mydata["price"], label="bathrooms")
sns.lineplot(x=(mydata["bedrooms"] + mydata["bathrooms"]), y=mydata["price"],
             color="black", label="bedrooms+bathrooms")
plt.legend(loc="best")
plt.xlabel("Bedrooms&Bathrooms")
Text(0.5, 0, 'Bedrooms&Bathrooms')
# Longitude against price
price_long_ax = plt.figure(figsize=(12, 8)).add_subplot(111)
sns.scatterplot(data=mydata, x='price', y='long', ax=price_long_ax)
<AxesSubplot:xlabel='price', ylabel='long'>
# Longitude against log price
logprice_long_ax = plt.figure(figsize=(12, 8)).add_subplot(111)
sns.scatterplot(data=mydata, x='logprice', y='long', ax=logprice_long_ax)
<AxesSubplot:xlabel='logprice', ylabel='long'>
# Latitude against price
price_lat_ax = plt.figure(figsize=(12, 8)).add_subplot(111)
sns.scatterplot(data=mydata, x='price', y='lat', ax=price_lat_ax)
<AxesSubplot:xlabel='price', ylabel='lat'>
# Latitude against log price
logprice_lat_ax = plt.figure(figsize=(12, 8)).add_subplot(111)
sns.scatterplot(data=mydata, x='logprice', y='lat', ax=logprice_lat_ax)
<AxesSubplot:xlabel='logprice', ylabel='lat'>
# Houses plotted by longitude/latitude, coloured by price
geo_ax = plt.figure(figsize=(12, 8)).add_subplot(111)
sns.scatterplot(
    data=mydata,
    x='long',
    y='lat',
    hue='price',
    palette='flag',
    edgecolor=None,
    alpha=0.5,
    ax=geo_ax,
)
<AxesSubplot:xlabel='long', ylabel='lat'>
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
# One box trace per room-count column, shown together on a shared y range
fig1 = [
    go.Box(y=mydata.bedrooms, name="Bedrooms", marker={"color": "green"}, hoverinfo="name+y"),
]
fig2 = [
    go.Box(y=mydata.bathrooms, name="Bathrooms", marker={"color": "yellow"}, hoverinfo="name+y"),
]
layout2 = go.Layout(title="Bedrooms vs Bathrooms ", yaxis={"range": [0, 13]})
fig3 = go.Figure(data=[*fig1, *fig2], layout=layout2)
iplot(fig3)
# Box plot of price for every level of each categorical feature.
# catplot is figure-level and opens its own figure, so no plt.figure call is
# made here (it would only leave an empty figure behind).
for categorical_feature in ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'grade']:
    sns.catplot(kind='box', x=categorical_feature, y='price', data=mydata, aspect=3)
<Figure size 864x432 with 0 Axes>
# Box plot of log price by waterfront flag.
# catplot is figure-level; a preceding plt.figure would only create an empty figure.
sns.catplot(kind='box', x="waterfront", y='logprice', data=mydata, aspect=3)
<seaborn.axisgrid.FacetGrid at 0x7f1677672750>
<Figure size 864x432 with 0 Axes>
sns.lineplot(x= "yr_built", y= "price", data=mydata)
<AxesSubplot:xlabel='yr_built', ylabel='price'>
import matplotlib.pyplot as plt
# Price by number of bedrooms (top) and bathrooms (bottom) as two stacked
# subplots of ONE figure. Requesting subplot 211 and 212 on two separate
# plt.figure() calls produced two figures, each with half of it blank.
fig, (bedrooms_ax, bathrooms_ax) = plt.subplots(2, 1, figsize=[7, 7])
bedrooms_ax.set_title('No Of Bedrooms in King county house')
sns.lineplot(x='bedrooms', y='price', data=mydata, legend='full', ax=bedrooms_ax)
bathrooms_ax.set_title('No Of Bathrooms in King county house')
sns.lineplot(x='bathrooms', y='price', data=mydata, legend='full', color='green', ax=bathrooms_ax)
# Keep the lower title from overlapping the upper x-axis label
fig.tight_layout()
<AxesSubplot:title={'center':'No Of Bathrooms in King county house'}, xlabel='bathrooms', ylabel='price'>
# Line plot of grade against price, then against the log-transformed price
for price_column in ('price', 'logprice'):
    plt.figure(figsize=(11, 6))
    sns.lineplot(x='grade', y=price_column, data=mydata)
    plt.show()
# Number of sales recorded per zipcode
mydata['zipcode'].value_counts()
98103 601
98038 589
98115 583
98052 574
98117 553
...
98102 104
98010 100
98024 80
98148 57
98039 50
Name: zipcode, Length: 70, dtype: int64
# Distribution of the distinct zipcodes.
# displot is figure-level, so the size is passed through height/aspect; a
# separate plt.figure call only leaves an empty figure behind. The bare
# mydata['zipcode'].unique() statement that used to precede this had no effect.
sns.displot(x=mydata['zipcode'].unique(), height=7, aspect=10 / 7)
<seaborn.axisgrid.FacetGrid at 0x18c5fddaac0>
<Figure size 720x504 with 0 Axes>
# Largest, smallest and distinct values of the zipcode column
mydata['zipcode'].max() # 98199
mydata['zipcode'].min() # 98001
mydata['zipcode'].unique()
array([98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, 98146,
98038, 98007, 98115, 98107, 98126, 98019, 98103, 98002, 98133,
98040, 98092, 98030, 98119, 98112, 98052, 98027, 98117, 98058,
98001, 98056, 98166, 98023, 98070, 98148, 98105, 98042, 98008,
98059, 98122, 98144, 98004, 98005, 98034, 98075, 98116, 98010,
98118, 98199, 98032, 98045, 98102, 98077, 98108, 98168, 98177,
98065, 98029, 98006, 98109, 98022, 98033, 98155, 98024, 98011,
98031, 98106, 98072, 98188, 98014, 98055, 98039], dtype=int64)
# Mean of every numeric column per zipcode.
# numeric_only=True makes the exclusion of the categorical yr_renovated column
# explicit; without it newer pandas versions raise instead of dropping it.
zipcode = mydata.groupby(['zipcode']).mean(numeric_only=True)
zipcode.head()
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | yr_built | lat | long | basement_present | renovated | logprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| zipcode | ||||||||||||||||
| 98001 | 2.811949e+05 | 3.393352 | 2.011773 | 1903.783934 | 14967.002770 | 1.430748 | 0.000000 | 0.102493 | 3.335180 | 7.296399 | 1980.867036 | 47.309106 | -122.270704 | 0.240997 | 0.022161 | 12.493124 |
| 98002 | 2.342840e+05 | 3.326633 | 1.839196 | 1627.743719 | 7517.633166 | 1.334171 | 0.000000 | 0.010050 | 3.743719 | 6.693467 | 1967.773869 | 47.308780 | -122.213357 | 0.155779 | 0.030151 | 12.337702 |
| 98003 | 2.941113e+05 | 3.357143 | 2.047321 | 1928.882143 | 10603.096429 | 1.310714 | 0.000000 | 0.214286 | 3.371429 | 7.542857 | 1976.885714 | 47.315741 | -122.310054 | 0.357143 | 0.010714 | 12.539857 |
| 98004 | 1.355927e+06 | 3.854890 | 2.527603 | 2909.022082 | 13104.220820 | 1.432177 | 0.003155 | 0.305994 | 3.495268 | 8.687697 | 1971.470032 | 47.616183 | -122.205189 | 0.466877 | 0.104101 | 14.008379 |
| 98005 | 8.101649e+05 | 3.851190 | 2.424107 | 2656.803571 | 19928.785714 | 1.279762 | 0.000000 | 0.095238 | 3.696429 | 8.488095 | 1969.744048 | 47.611532 | -122.167268 | 0.517857 | 0.023810 | 13.560104 |
# Mean values of waterfront houses per zipcode, ordered by mean price.
# numeric_only=True keeps the categorical yr_renovated column out of the mean
# explicitly (newer pandas raises otherwise).
wfzips = mydata[mydata['waterfront'] == 1].groupby('zipcode').mean(numeric_only=True).sort_values('price')
wfzips = wfzips.reset_index()
plt.figure(figsize=[20,10])
# `order` keeps the bars in the price order established above; without it
# seaborn re-sorts the numeric zipcodes and the sort has no visible effect.
# The axes handle is no longer named `zip`, which shadowed the builtin.
zip_ax = sns.barplot(x=wfzips.zipcode, y=wfzips.price, order=wfzips.zipcode)
plt.xticks(rotation=45)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
[Text(0, 0, '98004'),
Text(1, 0, '98006'),
Text(2, 0, '98008'),
Text(3, 0, '98023'),
Text(4, 0, '98027'),
Text(5, 0, '98028'),
Text(6, 0, '98033'),
Text(7, 0, '98034'),
Text(8, 0, '98039'),
Text(9, 0, '98040'),
Text(10, 0, '98052'),
Text(11, 0, '98056'),
Text(12, 0, '98070'),
Text(13, 0, '98074'),
Text(14, 0, '98075'),
Text(15, 0, '98105'),
Text(16, 0, '98115'),
Text(17, 0, '98116'),
Text(18, 0, '98118'),
Text(19, 0, '98125'),
Text(20, 0, '98136'),
Text(21, 0, '98144'),
Text(22, 0, '98146'),
Text(23, 0, '98155'),
Text(24, 0, '98166'),
Text(25, 0, '98177'),
Text(26, 0, '98178'),
Text(27, 0, '98198'),
Text(28, 0, '98199')])
# Correlation of selected columns with price and logprice (the price row itself removed)
selected_columns = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'logprice', 'renovated',
    'basement_present',
]
numerical = mydata[selected_columns].corr().drop('price')
numerical[['price', 'logprice']].sort_values('logprice')
| price | logprice | |
|---|---|---|
| condition | 0.036000 | 0.038838 |
| sqft_lot | 0.089885 | 0.100029 |
| renovated | 0.126076 | 0.114103 |
| waterfront | 0.266438 | 0.174690 |
| basement_present | 0.180076 | 0.212152 |
| floors | 0.256831 | 0.310668 |
| view | 0.397323 | 0.346594 |
| bedrooms | 0.315966 | 0.350866 |
| bathrooms | 0.525915 | 0.551251 |
| sqft_living | 0.701921 | 0.695170 |
| grade | 0.667935 | 0.703706 |
| logprice | 0.891720 | 1.000000 |
# Bar chart of each selected column's correlation with price.
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(figsize=(20,20)) only left an empty figure behind.
numerical[['price']].sort_values('price').plot(kind = "barh", figsize=(20, 20))
<AxesSubplot:>
<Figure size 1440x1440 with 0 Axes>
mydata.corr()
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | yr_built | zipcode | lat | long | basement_present | renovated | logprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| price | 1.000000 | 0.315966 | 0.525915 | 0.701921 | 0.089885 | 0.256831 | 0.266438 | 0.397323 | 0.036000 | 0.667935 | 0.053995 | -0.053443 | 0.306772 | 0.022103 | 0.180076 | 0.126076 | 0.891720 |
| bedrooms | 0.315966 | 1.000000 | 0.527870 | 0.593178 | 0.033602 | 0.183707 | -0.006869 | 0.082437 | 0.023440 | 0.366174 | 0.160736 | -0.158588 | -0.011595 | 0.136561 | 0.164290 | 0.018864 | 0.350866 |
| bathrooms | 0.525915 | 0.527870 | 1.000000 | 0.755755 | 0.088368 | 0.502574 | 0.063742 | 0.188381 | -0.126443 | 0.665834 | 0.507166 | -0.204778 | 0.024301 | 0.224889 | 0.162800 | 0.050060 | 0.551251 |
| sqft_living | 0.701921 | 0.593178 | 0.755755 | 1.000000 | 0.173449 | 0.353941 | 0.103853 | 0.284704 | -0.059397 | 0.762776 | 0.318140 | -0.199793 | 0.052178 | 0.241200 | 0.204362 | 0.055034 | 0.695170 |
| sqft_lot | 0.089885 | 0.033602 | 0.088368 | 0.173449 | 1.000000 | -0.004824 | 0.021631 | 0.074897 | -0.008806 | 0.114726 | 0.052939 | -0.129582 | -0.085507 | 0.230222 | -0.035216 | 0.007786 | 0.100029 |
| floors | 0.256831 | 0.183707 | 0.502574 | 0.353941 | -0.004824 | 1.000000 | 0.023752 | 0.028801 | -0.264013 | 0.458783 | 0.489175 | -0.059522 | 0.049280 | 0.125912 | -0.256507 | 0.006340 | 0.310668 |
| waterfront | 0.266438 | -0.006869 | 0.063742 | 0.103853 | 0.021631 | 0.023752 | 1.000000 | 0.401970 | 0.016624 | 0.082886 | -0.026157 | 0.030274 | -0.014302 | -0.041908 | 0.037190 | 0.093281 | 0.174690 |
| view | 0.397323 | 0.082437 | 0.188381 | 0.284704 | 0.074897 | 0.028801 | 0.401970 | 1.000000 | 0.046041 | 0.251722 | -0.053649 | 0.084629 | 0.005884 | -0.078120 | 0.181939 | 0.104093 | 0.346594 |
| condition | 0.036000 | 0.023440 | -0.126443 | -0.059397 | -0.008806 | -0.264013 | 0.016624 | 0.046041 | 1.000000 | -0.146854 | -0.361555 | 0.002835 | -0.015209 | -0.105800 | 0.135094 | -0.060294 | 0.038838 |
| grade | 0.667935 | 0.366174 | 0.665834 | 0.762776 | 0.114726 | 0.458783 | 0.082886 | 0.251722 | -0.146854 | 1.000000 | 0.447854 | -0.185761 | 0.113602 | 0.200324 | 0.051289 | 0.013848 | 0.703706 |
| yr_built | 0.053995 | 0.160736 | 0.507166 | 0.318140 | 0.052939 | 0.489175 | -0.026157 | -0.053649 | -0.361555 | 0.447854 | 1.000000 | -0.347199 | -0.148340 | 0.409974 | -0.167806 | -0.225240 | 0.080625 |
| zipcode | -0.053443 | -0.158588 | -0.204778 | -0.199793 | -0.129582 | -0.059522 | 0.030274 | 0.084629 | 0.002835 | -0.185761 | -0.347199 | 1.000000 | 0.266729 | -0.564253 | 0.162841 | 0.064308 | -0.038814 |
| lat | 0.306772 | -0.011595 | 0.024301 | 0.052178 | -0.085507 | 0.049280 | -0.014302 | 0.005884 | -0.015209 | 0.113602 | -0.148340 | 0.266729 | 1.000000 | -0.135340 | 0.138323 | 0.029312 | 0.448884 |
| long | 0.022103 | 0.136561 | 0.224889 | 0.241200 | 0.230222 | 0.125912 | -0.041908 | -0.078120 | -0.105800 | 0.200324 | 0.409974 | -0.564253 | -0.135340 | 1.000000 | -0.236979 | -0.068336 | 0.050919 |
| basement_present | 0.180076 | 0.164290 | 0.162800 | 0.204362 | -0.035216 | -0.256507 | 0.037190 | 0.181939 | 0.135094 | 0.051289 | -0.167806 | 0.162841 | 0.138323 | -0.236979 | 1.000000 | 0.048905 | 0.212152 |
| renovated | 0.126076 | 0.018864 | 0.050060 | 0.055034 | 0.007786 | 0.006340 | 0.093281 | 0.104093 | -0.060294 | 0.013848 | -0.225240 | 0.064308 | 0.029312 | -0.068336 | 0.048905 | 1.000000 | 0.114103 |
| logprice | 0.891720 | 0.350866 | 0.551251 | 0.695170 | 0.100029 | 0.310668 | 0.174690 | 0.346594 | 0.038838 | 0.703706 | 0.080625 | -0.038814 | 0.448884 | 0.050919 | 0.212152 | 0.114103 | 1.000000 |
mydata.corr()['price'].sort_values(ascending=False)
price 1.000000 logprice 0.891720 sqft_living 0.701921 grade 0.667935 bathrooms 0.525915 view 0.397323 bedrooms 0.315966 lat 0.306772 waterfront 0.266438 floors 0.256831 basement_present 0.180076 renovated 0.126076 sqft_lot 0.089885 yr_built 0.053995 condition 0.036000 long 0.022103 zipcode -0.053443 Name: price, dtype: float64
# The correlation between price and zipcode is negative; I want to drop the other negatively correlated variables as well
#mydata = mydata.drop('zipcode', axis = 1)
from sklearn import linear_model
import statsmodels.api as sm
mydata.isna().sum()
price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 basement_present 0 renovated 0 logprice 0 dtype: int64
# Names of all columns from the fourth column onward (positional slice 3:).
# NOTE(review): this positional slice leaves out 'bedrooms' and 'bathrooms' and
# includes 'logprice', which is derived from the target; the modelling cells
# visible here build their own column list and do not seem to use `features` —
# confirm before relying on it.
features = mydata.iloc[:, 3:].columns.tolist()
features
['sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'basement_present', 'renovated', 'logprice']
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Linear regression before log-transformation of the house price
predictor_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'waterfront', 'floors',
    'view', 'condition', 'grade', 'yr_built', 'zipcode',
    'basement_present', 'yr_renovated', 'renovated', 'lat', 'long',
]
y = mydata['price']
x = mydata[predictor_columns]
# Split the data into training (70%) and testing (30%) sets
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=101)
separator = "*****"*10
print(f'Total # of sample in whole dataset: {len(x)}')
print(separator)
print(f'Total # of sample in train dataset: {len(xtrain)}')
print(f'Shape of X_train: {xtrain.shape}')
print(separator)
print(f'Total # of sample in test dataset: {len(xtest)}')
print(f'Shape of X_test: {xtest.shape}')
Total # of sample in whole dataset: 21596 ************************************************** Total # of sample in train dataset: 15117 Shape of X_train: (15117, 16) ************************************************** Total # of sample in test dataset: 6479 Shape of X_test: (6479, 16)
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split only, then apply it to both splits
scaler = MinMaxScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)
# Create the linear regression model and train it on the scaled training data
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(xtrain, ytrain)
LinearRegression()
#predicting price for test set
pred=LR.predict(xtest)
print(pred)
[ 593302.10515081 627256.38956754 614938.96263215 ... 2230877.98561131 242497.31859545 153760.24733677]
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# R-square of the fitted linear model on the training and the test split
for score_label, split_x, split_y in (
    ('Training r2_Score', xtrain, ytrain),
    ('Testing r2_Score ', xtest, ytest),
):
    print(score_label, LR.score(split_x, split_y))
Training r2_Score 0.7001096503215504 Testing r2_Score 0.700356888839464
# Results: model scores on both splits, printed as percentages
train_score = LR.score(xtrain, ytrain)
test_score = LR.score(xtest, ytest)
print(f'Train score of trained model: {train_score*100}')
print(f'Test score of trained model: {test_score*100}')
Train score of trained model: 70.01096503215504 Test score of trained model: 70.0356888839464
from sklearn import metrics
metrics.mean_squared_error(ytest,pred)
40896849875.91457
#Adjusted R-square gives a fairer evaluation score than plain R-square because it penalises the number of predictors used by the model.
#To calculate Adjusted R2 we first calculate the variance of Y_test
var_test = ytest.var()
var_test
136506268768.45143
#We use the above-calculated variance to compute Adjusted R-square:
#   adj R2 = 1 - [SSE / (n - p - 1)] / [SST / (n - 1)]
# where SST / (n - 1) is the sample variance of ytest (var_test), SSE = n * MSE,
# n is the number of test samples and p the number of predictors.
# A plain 1 - MSE/var_test has no correction for p, so it is only (approximately)
# the ordinary R-square, not the adjusted one.
n_test_samples, n_predictors = xtest.shape
sse_test = n_test_samples * metrics.mean_squared_error(ytest, pred)
Adj_rsquare = 1 - (sse_test / (n_test_samples - n_predictors - 1)) / var_test
Adj_rsquare
0.7004031371974146
#Here we use numpy and metrics to calculate the RMSE.
np.sqrt(metrics.mean_squared_error(ytest,pred))
202229.69583103905
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# A fixed random_state makes the forest, and therefore the scores printed
# below, reproducible between runs (same seed as the train/test split).
rf = RandomForestRegressor(n_estimators=150, max_depth=7, random_state=101)
rf.fit(xtrain, ytrain)
ypred_rf = rf.predict(xtest)
print('Training r2_score',rf.score(xtrain,ytrain))
print('Testing r2_score',rf.score(xtest,ytest))
Training r2_score 0.871967281897047 Testing r2_score 0.8326822332211335
#predicting price for test set for randomforest
pred_rf=rf.predict(xtest)
print(pred_rf)
[ 649797.38479962 609106.40390504 599899.54238376 ... 2088067.79816544 292141.75298522 211142.11111809]
# Linear regression after log-transformation of the house price
predictor_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'waterfront', 'floors',
    'view', 'condition', 'grade', 'yr_built', 'zipcode',
    'basement_present', 'yr_renovated', 'renovated', 'lat', 'long',
]
y = mydata['logprice']
x = mydata[predictor_columns]
# Split the data into training (70%) and testing (30%) sets
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=101)
separator = "*****"*10
print(f'Total # of sample in whole dataset: {len(x)}')
print(separator)
print(f'Total # of sample in train dataset: {len(xtrain)}')
print(f'Shape of X_train: {xtrain.shape}')
print(separator)
print(f'Total # of sample in test dataset: {len(xtest)}')
print(f'Shape of X_test: {xtest.shape}')
Total # of sample in whole dataset: 21596 ************************************************** Total # of sample in train dataset: 15117 Shape of X_train: (15117, 16) ************************************************** Total # of sample in test dataset: 6479 Shape of X_test: (6479, 16)
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training split only, then apply it to both splits
scaler = MinMaxScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)
x.shape
(21596, 16)
# Create the linear regression model and train it on the scaled training data
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(X=xtrain, y=ytrain)
LinearRegression()
#predicting price for test set
pred2=LR.predict(xtest)
print(pred2)
[13.22904297 13.29652011 13.20238589 ... 15.01265069 12.57571415 12.35630799]
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# R-square of the fitted linear model on the training and the test split
for score_label, split_x, split_y in (
    ('Training r2_Score', xtrain, ytrain),
    ('Testing r2_Score ', xtest, ytest),
):
    print(score_label, LR.score(split_x, split_y))
Training r2_Score 0.7653584755754196 Testing r2_Score 0.7688093898812656
# Results: model scores on both splits, printed as percentages
train_score = LR.score(xtrain, ytrain)
test_score = LR.score(xtest, ytest)
print(f'Train score of trained model: {train_score*100}')
print(f'Test score of trained model: {test_score*100}')
Train score of trained model: 76.53584755754196 Test score of trained model: 76.88093898812656
from sklearn import metrics
metrics.mean_squared_error(ytest,pred2)
0.06374758088504465
#Adjusted R-square gives a fairer evaluation score than plain R-square because it penalises the number of predictors used by the model.
#To calculate Adjusted R2 we first calculate the variance of Y_test
var_test = ytest.var()
var_test
0.2757785944644265
#We use the above-calculated variance to compute Adjusted R-square:
#   adj R2 = 1 - [SSE / (n - p - 1)] / [SST / (n - 1)]
# where SST / (n - 1) is the sample variance of ytest (var_test), SSE = n * MSE,
# n is the number of test samples and p the number of predictors.
# A plain 1 - MSE/var_test has no correction for p, so it is only (approximately)
# the ordinary R-square, not the adjusted one.
n_test_samples, n_predictors = xtest.shape
sse_test = n_test_samples * metrics.mean_squared_error(ytest, pred2)
Adj_rsquare = 1 - (sse_test / (n_test_samples - n_predictors - 1)) / var_test
Adj_rsquare
0.7688450729512016
#Here we use numpy and metrics to calculate the RMSE.
np.sqrt(metrics.mean_squared_error(ytest,pred2))
0.2524828328521459
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# A fixed random_state makes the forest, and therefore the scores printed
# below, reproducible between runs (same seed as the train/test split).
rf = RandomForestRegressor(n_estimators=150, max_depth=7, random_state=101)
rf.fit(xtrain, ytrain)
ypred_rf = rf.predict(xtest)
print('Training r2_score',rf.score(xtrain,ytrain))
print('Testing r2_score',rf.score(xtest,ytest))
Training r2_score 0.8559218916314213 Testing r2_score 0.8378526408328691
#predicting price for test set
pred_rf=rf.predict(xtest)
print(pred_rf)
[13.26707911 13.28209623 13.24882309 ... 14.14610032 12.50440697 12.18978532]